Make input for Pavian Sankey Plot
# Build the Pavian Sankey input: one row per distinct GTDB-Tk lineage string
# together with the number of bins (`n`) that carry it.
neon_jgi_pilot_ind_pavian <- read_csv("neon-jgi_pilot_ind_assembly_bins.csv") %>%
  select(`GTDB-Tk Taxonomy Lineage`) %>%
  # Join the ranks with "|" instead of "; ". The backslash in the replacement
  # only escapes the pipe, so a literal "|" is inserted. No factor conversion
  # is needed beforehand: str_replace_all() returns a character vector anyway.
  mutate(
    `GTDB-Tk Taxonomy Lineage` =
      str_replace_all(`GTDB-Tk Taxonomy Lineage`, "; ", "\\|")
  ) %>%
  count(`GTDB-Tk Taxonomy Lineage`)
## Rows: 1130 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Save the lineage counts as a tab-separated file.
write_tsv(
  x = neon_jgi_pilot_ind_pavian,
  file = "neon_jgi_pilot_ind_pavian.tsv"
)
# Tidy the bin table: shorter names for the quality columns, one column per
# taxonomic rank, and the genome name broken down into site, sample name,
# site ID, subplot, layer and date.
neon_jgi_pilot_ind_bins <- read_csv("neon-jgi_pilot_ind_assembly_bins.csv") %>%
  rename(
    Completeness = `Bin Completeness`,
    Contamination = `Bin Contamination`,
    Site = `Genome Name`
  ) %>%
  # Put the taxa categories in separate columns. Lineages are reported down to
  # the species level, so a Species column is required; with only six names the
  # seventh piece was discarded ("Additional pieces discarded" warning).
  # Lineages that stop at a higher rank are padded with NA on the right.
  # Rank prefixes (d__, p__, ..., s__) are left on the values; to strip them:
  #   mutate(across(Domain:Species, \(x) str_replace(x, "^[a-z]__", "")))
  separate(
    `GTDB-Tk Taxonomy Lineage`,
    c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
    sep = "; ",
    fill = "right"
  ) %>%
  # Simplify Site name
  mutate(Site = str_replace(Site, "Soil microbial communities from ", "")) %>%
  separate(Site, c("Site", "Sample Name"), sep = " - ") %>%
  mutate(`Sample Name` = str_replace(`Sample Name`, "-comp-1", "")) %>%
  # Keep the full sample name next to its parsed components.
  separate(
    `Sample Name`,
    c("Site ID", "subplot.layer.date"),
    sep = "_",
    remove = FALSE
  ) %>%
  separate(subplot.layer.date, c("Subplot", "Layer", "Date"), sep = "-")
## Rows: 1130 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 21 rows [92, 131, 132, 228,
## 252, 575, 576, 678, 723, 730, 824, 825, 826, 827, 828, 891, 925, 946, 981, 996,
## ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 282 rows [39, 40, 41, 57,
## 59, 64, 65, 66, 93, 94, 133, 145, 147, 148, 149, 172, 176, 181, 186, 187, ...].
# Save the tidied bin table as a tab-separated file.
write_tsv(
  x = neon_jgi_pilot_ind_bins,
  file = "neon_jgi_pilot_ind_bins.tsv"
)